# Always print this out before your assignment
sessionInfo()
getwd()
library('tidyverse')
-- Attaching packages ---------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.1 --
v ggplot2 3.3.5     v purrr   0.3.4
v tibble  3.1.5     v stringr 1.4.0
v tidyr   1.1.4     v forcats 0.5.1
v readr   2.0.2     
-- Conflicts ------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
x readr::col_factor() masks scales::col_factor()
x purrr::discard()    masks scales::discard()
x dplyr::filter()     masks stats::filter()
x dplyr::lag()        masks stats::lag()
library("fs")
library('here')
here() starts at C:/Users/cabrooke/Documents/R/696/group project/final project/final_project
library('dplyr')
library('tidyverse')
library('ggplot2')
library('ggrepel')
library('ggthemes')
library('forcats')
library('rsample')
library('lubridate')

Attaching package: ‘lubridate’

The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union
library('ggthemes')
library('kableExtra')

Attaching package: ‘kableExtra’

The following object is masked from ‘package:dplyr’:

    group_rows
library('pastecs')

Attaching package: ‘pastecs’

The following object is masked from ‘package:tidyr’:

    extract

The following objects are masked from ‘package:dplyr’:

    first, last
library('viridis')
Loading required package: viridisLite

Attaching package: ‘viridis’

The following object is masked from ‘package:scales’:

    viridis_pal
library('plotly')

Attaching package: ‘plotly’

The following object is masked from ‘package:ggplot2’:

    last_plot

The following object is masked from ‘package:stats’:

    filter

The following object is masked from ‘package:graphics’:

    layout
library('tidyquant')
Loading required package: PerformanceAnalytics
Loading required package: xts
Loading required package: zoo

Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric


Attaching package: ‘xts’

The following objects are masked from ‘package:pastecs’:

    first, last

The following objects are masked from ‘package:dplyr’:

    first, last


Attaching package: ‘PerformanceAnalytics’

The following object is masked from ‘package:graphics’:

    legend

Loading required package: quantmod
Loading required package: TTR
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
== Need to Learn tidyquant? ===================================================================================================================================
Business Science offers a 1-hour course - Learning Lab #9: Performance Analysis & Portfolio Optimization with tidyquant!
</> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library('scales')

Final Project Cleaning and Summary Statistics

1a) Loading data


#Reading the data in and doing minor initial cleaning in the function call
#Reproducible data analysis should avoid all automatic string to factor conversions.
#strip.white removes white space 
#na.strings is a substitution so all that have "" will = na
data <- read.csv(here::here("final_project", "donor_data.csv"),
                 stringsAsFactors = FALSE,
                 strip.white = TRUE,
                 na.strings = "")

1b) Fixing the wonky DOB & Data cleanup

glimpse(data_cleaned$zipslry_range)
 logi [1:323000] NA NA NA NA NA NA ...

1c Creating factor variable for sex and married


data_cleaned <- 
  data_cleaned %>% 
  mutate(sex_fct = 
           fct_explicit_na(Sex)
  )


data_cleaned <-
data_cleaned %>% 
mutate(
  sex_simple = 
    fct_lump_n(Sex, n = 4)
)

#checking to see if its a factor
class(data_cleaned$sex_fct)

#checking levels
levels(data_cleaned$sex_simple)

#creating a table against Sex column 
table(data_cleaned$sex_fct, data_cleaned$sex_simple)

#making married a factor 
data_cleaned_columns <- 
  data_cleaned_columns %>% 
  mutate(married_fct = 
           fct_explicit_na(Married)
  )

#checking to see if its a factor
class(data_cleaned$married_fct)

1d #Mean, Median, and Count of Giving in Age Ranges


age_range_giving <- datacleaning %>%
  group_by(age_range) %>%
  summarise(avg_giving = mean(HH.Lifetime.Giving, na.rm = TRUE),
            med_giving = median(HH.Lifetime.Giving, na.rm = TRUE),
            amount_of_people_in_age_range = n())

Question 2

DonorSegment Analysis

#grouping by donorsegment and analyzing 
data_cleaned_columns %>%
  group_by(Donor.Segment) %>%
  summarise(Count = length(Donor.Segment),
            mean_total_giv = mean(HH.Lifetime.Giving)) %>%
  arrange(-Count) %>%
  filter(Count >= 100) %>%
  #added scales package to have the values show in dollar 
  mutate(mean_total_giv = dollar(mean_total_giv)) %>%
  kable(col.names = c("Donor Segment", "Count", "Mean HH Lifetime Giving"), align=rep('c', 3)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = F)
Donor Segment Count Mean HH Lifetime Giving
NA 232033 $0
Lost Donor 69733 $4,958
Lapsed Donor 11220 $11,195
Current Donor 5704 $104,142
Lapsing Donor 3879 $16,595
At-Risk Donor 657 $85,198
NA
NA

2a) Plotting average giving by age range


ggplot(age_range_giving, aes(avg_giving, age_range)) +
  geom_bar(stat = "identity")

NA
NA

2b) Count of donors based on age range (another way to look at it)


ggplot(datacleaning, 
       aes(age_range)) + 
       geom_bar() + 
       theme(axis.text.x = element_text(angle=45,
                                        hjust=1)) + 
  labs(title = "Count of Age Ranges", x = "", y = "")

NA
NA

2c) Boxplot of the Age Ranges Against the Lifetime Giving Amounts with a log scale applied - the reason we applied log scale is to resolve issues with visualizations that skew towards large values in our dataset.


ggplot(datacleaning, aes(age_range,HH.Lifetime.Giving,fill = age_range)) + 
  geom_boxplot(
  outlier.colour = "red") + 
  scale_y_log10() +
  theme(axis.text.x=element_text(angle=45,hjust=1))
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Removed 232033 rows containing non-finite values (stat_boxplot).

NA
NA

2d) Splitting by age and gender



#creating boxplots 
datacleaning %>% 
  filter(Age < 100) %>% #removing the weird outliers that are over 100 
  filter(Sex %in% c("M", "F")) %>%
  ggplot(aes(Sex, Age)) + 
  geom_boxplot() + 
  theme_economist() + 
  ggtitle("Ages of Donors Based on Gender") + 
  xlab(NULL) + ylab(NULL)

NA
NA
NA
NA
NA

2e) Distribution of people in the states that they live.


  datacleaning %>%
  mutate(State = ifelse(State == " ", "NA", State)) %>%
  filter(State != "NA") %>%
  group_by(State) %>%
  summarise(Count = length(State)) %>%
  filter(Count > 800) %>%
  arrange(-Count) %>%
  kable(col.names = c("Donor's State", "Count")) %>%
  kable_styling(bootstrap_options = c("condensed"),
                full_width = F)
Donor's State Count
CA 176695
WA 7958
TX 7268
NY 5661
CO 5073
AZ 4929
OR 4613
FL 4111
IL 3681
HI 3394
PA 2904
OH 2754
NV 2715
MI 2524
MA 2473
NJ 2311
VA 2158
NC 2087
GA 2045
MO 1889
MN 1732
MD 1488
TN 1443
IN 1417
CT 1380
WI 1330
UT 1174
OK 1151
AL 1120
LA 1110
ID 1096
SC 1076
KY 1032
KS 1027
NM 982
IA 880
NA
NA
NA
NA
NA
NA

2f) Looking at all donors first gift amount. 75% made a first gift of <100.


 no_non_donors <- datacleaning %>%
  filter(Lifetime.Giving != 0)
  
nd <- quantile(no_non_donors$HH.First.Gift.Amount, probs = c(.25,.50,.75,.9,.99), na.rm = TRUE)

nd <- as.data.frame(nd)

nd %>%
  kable(col.names = "Quantile") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = F)
Quantile
25% 3.8
50% 25.0
75% 100.0
90% 500.0
99% 15000.0
NA
NA
NA
NA

Modeling for you

3a) Linear model

#converting married Y and N to 1 and 0 
datacleaning <- datacleaning %>%
      mutate(Married_simple = ifelse(Married == "N",0,1))
 

mod1lm <- lm( Married_simple ~ Lifetime.Giving,
           data = datacleaning)

summary(mod1lm)

Call:
lm(formula = Married_simple ~ Lifetime.Giving, data = datacleaning)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.4107 -0.2872 -0.2872  0.7128  0.7128 

Coefficients:
                      Estimate     Std. Error t value            Pr(>|t|)    
(Intercept)     0.287196453375 0.000796143577  360.73 <0.0000000000000002 ***
Lifetime.Giving 0.000000006818 0.000000007174    0.95               0.342    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4525 on 323224 degrees of freedom
Multiple R-squared:  2.795e-06, Adjusted R-squared:  -2.991e-07 
F-statistic: 0.9033 on 1 and 323224 DF,  p-value: 0.3419
  

Kmeans

is.numeric(data_cleaned$HH.Lifetime.Giving)
[1] TRUE

3a)

p <- datacleaning %>%
  ggplot(aes(Age)) + geom_histogram(bins=30, fill = "blue") + theme_economist_white() +
  ggtitle("Overall Donor Age Distribution") + 
  xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(5,100,by = 20)) +
  scale_y_continuous(breaks = seq(20,100,by = 20)) + xlim(c(20,100))
Scale for 'x' is already present. Adding another scale for 'x', which will replace the existing scale.
ggplotly(p)
Warning: Removed 199288 rows containing non-finite values (stat_bin).
  
p
Warning: Removed 199288 rows containing non-finite values (stat_bin).
Warning: Removed 2 rows containing missing values (geom_bar).

ggplot(data = datacleaning, aes(x = Age)) + geom_histogram(fill ="blue")+ xlim(c(20,100))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 199288 rows containing non-finite values (stat_bin).
Warning: Removed 2 rows containing missing values (geom_bar).

NA
NA
NA
---
title: "BROCODE Summary Statistics"
author: "Aaron, Cannon, Josh, Ryan"
subtitle: Final Project Summary Statistics
output:
  html_document:
    df_print: paged
  html_notebook: default
---

```{r setup, include=FALSE}

# Please leave this code chunk as is. It makes some slight formatting changes to alter the output to be more aesthetically pleasing. 

library(knitr)


# Change the number in set seed to your own favorite number
set.seed(1818)
options(width=70)
options(scipen=99)


# this sets text outputted in code chunks to small
opts_chunk$set(tidy.opts=list(width.wrap=50),tidy=TRUE, size = "vsmall")  
opts_chunk$set(message = FALSE,                                          
               warning = FALSE,
               # "caching" stores objects in code chunks and only rewrites if you change things
               cache = TRUE,                               
               # automatically downloads dependency files
               autodep = TRUE,
               # 
               cache.comments = FALSE,
               # 
               collapse = TRUE,
               # change fig.width and fig.height to change the code height and width by default
               fig.width = 5.5,  
               fig.height = 4.5,
               fig.align='center')


```

```{r setup-2}

# Always print this out before your assignment
sessionInfo()
getwd()

```


<!-- ### start answering your problem set here -->
<!-- You may export your homework in either html or pdf, with the former usually being easier. 
     To export or compile your Rmd file: click above on 'Knit' then 'Knit to HTML' -->
<!-- Be sure to submit both your .Rmd file and the compiled .html or .pdf file for full credit -->


```{r setup-3}

# load all your libraries in this chunk 
library('tidyverse')
library("fs")
library('here')
library('dplyr')
library('tidyverse')
library('ggplot2')
library('ggrepel')
library('ggthemes')
library('forcats')
library('rsample')
library('lubridate')
library('ggthemes')
library('kableExtra')
library('pastecs')
library('viridis')
library('plotly')
library('tidyquant')
library('scales')


# note, do not run install.packages() inside a code chunk. install them in the console outside of a code chunk. 

```



## Final Project Cleaning and Summary Statistics 

1a) Loading data

```{r}

#Reading the data in and doing minor initial cleaning in the function call
#Reproducible data analysis should avoid all automatic string to factor conversions.
#strip.white removes white space 
#na.strings is a substitution so all that have "" will = na
data <- read.csv(here::here("final_project", "donor_data.csv"),
                 stringsAsFactors = FALSE,
                 strip.white = TRUE,
                 na.strings = "")

```


1b) Fixing the wonky DOB & Data cleanup

```{r}

#(Birthdate and Age, ID as a number)adding DOB (Age/Spouse Age) in years columns and adding two fields for assignment and number of children
datacleaning <- data %>%
  mutate(Birthdate = ifelse(Birthdate == "0001-01-01", NA, Birthdate)) %>%
  mutate(Birthdate = mdy(Birthdate)) %>%
  mutate(Age = as.numeric(floor(interval(start= Birthdate, end=Sys.Date())/duration(n=1, unit="years")))) %>%
  mutate(Spouse.Birthdate = ifelse(Spouse.Birthdate == "0001-01-01", NA, Spouse.Birthdate)) %>%
  mutate(Spouse.Birthdate = mdy(Spouse.Birthdate)) %>%
  mutate(Spouse.Age = as.numeric(floor(interval(start= Spouse.Birthdate,
                                                end=Sys.Date())/duration(n=1, unit="years")))) %>%
  mutate(ID = as.numeric(ID)) %>% 
  mutate(Assignment_flag = ifelse(is.na(Assignment.Number), 0,1)) %>% 
  mutate( No_of_Children = ifelse(is.na(Child.1.ID),0,
                            ifelse(is.na(Child.2.ID),1,2)))

#splitting up the age into ranges and creating category for easy visualization 
datacleaning <- datacleaning %>%
  mutate(age_range = 
    ifelse(Age %in% 10:19, "10 < 20 year olds",
    ifelse(Age %in% 20:29, "20 < 30 year olds", 
    ifelse(Age %in% 30:39, "30 < 40 year olds",
    ifelse(Age %in% 40:49, "40 < 50 year olds",
    ifelse(Age %in% 50:59, "50 < 60 year olds",
    ifelse(Age %in% 60:69, "60 < 70 year olds",
    ifelse(Age %in% 70:79, "70 < 80 year olds",
    ifelse(Age %in% 80:89, "80 < 90 year olds",
    ifelse(Age %in% 90:99, "90 < 100 year olds",
    ifelse(Age %in% 100:109, "100 < 110 year olds",
    ifelse(Age %in% 110:120, "110 - 120  year olds",
    NA))))))))))))

#splitting zipcode salary into ranges for easy visualization 
data_cleaned <- data_cleaned %>%
  mutate(zipslry_range = 
    ifelse(zipcode_slry %in% 90000:99000, "90K-99K",
    ifelse(zipcode_slry %in% 100000:149000, "100K-149K", 
    ifelse(zipcode_slry %in% 150000:199000, "150K-199K",
    ifelse(zipcode_slry %in% 200000:249000, "200K-249K",
    ifelse(zipcode_slry %in% 250000:299000, "250K-299K",
    ifelse(zipcode_slry %in% 300000:349000, "300K-349K",
    ifelse(zipcode_slry %in% 350000:399000, "350K-399K",
    ifelse(zipcode_slry %in% 400000:499000, "400K-499K",
    ifelse(zipcode_slry %in% 500000:999000, "500K-999K",
    NA))))))))))

glimpse(data_cleaned$zipslry_range)


#seeing what we have
table(datacleaning$age_range)
#50-60 is the most common age range 

#Removing Columns that provide no benefit 

data_cleaned_columns <- subset(datacleaning,select = -c(Assignment.Number
                                                        ,Assignment.has.Historical.Mngr
                                                        ,Suffix
                                                        ,Assignment.Date
                                                        ,Assignment.Manager
                                                        ,Assignment.Role
                                                        ,Assignment.Title
                                                        ,Assignment.Status
                                                        ,Strategy
                                                        ,Progress.Level
                                                        ,Assignment.Group
                                                        ,Assignment.Category
                                                        ,Funding.Method
                                                        ,Expected.Book.Date
                                                        ,Qualification.Amount
                                                        ,Expected.Book.Amount
                                                        ,Expected.Book.Date
                                                        ,Hard.Gift.Total
                                                        ,Soft.Credit.Total
                                                        ,Total.Assignment.Gifts
                                                        ,No.of.Pledges
                                                        ,Proposal..
                                                        ,Proposal.Notes
                                                        ,HH.Life.Hard.Credit
                                                        ,HH.Life.Soft.Credit
                                                        ,HH.Life.Spouse.Credit
                                                        ,Last.Contact.By.Manager
                                                        ,X..of.Contacts.By.Manager))
#cleaning up zip codes removing -4 after 
data_cleaned_columns$Zip <- gsub(data_cleaned_columns$Zip, pattern="-.*", replacement = "")

#adding zip code data and column 
zip <- read.csv(here::here("final_project", "Salary_Zipcode.csv"),
                 stringsAsFactors = FALSE,
                 strip.white = TRUE,
                 na.strings = "")

#adding zip salary column
data_cleaned_columns <-data_cleaned_columns %>%
    mutate(zipcode_slry = VLOOKUP(Zip, zip, NAME, S1902_C03_002E))

#adding scholarship data (y/n)
schlr <- read.csv(here::here("final_project", "scholarship.csv"),
                 stringsAsFactors = FALSE,
                 strip.white = TRUE,
                 na.strings = "")

#adding scholarship column
data_cleaned_columns <-data_cleaned_columns %>%
    mutate(scholarship = VLOOKUP(ID, schlr, ID, SCHOLARSHIP)) 

#replacing NA with 0 
 data_cleaned_columns$scholarship <- replace_na(data_cleaned_columns$scholarship,'0')
 
#replacing Y with 1 
data_cleaned_columns$scholarship<-ifelse(data_cleaned_columns$scholarship=="Y",1,0)

#checking how many are N
table(data_cleaned_columns$scholarship)


#checking and deleting scholarship column 
class(data_cleaned_columns$schlr_fct)
data_cleaned_columns = subset(data_cleaned_columns, select = -c(scholarship))
  
#checking for duplicates N >1 indicates a records values are in the file twice 
data_cleaned_columns %>% group_by(ID) %>% count() %>% arrange(desc(n))

#removing duplicated records

data_cleaned <- unique(data_cleaned_columns)

#n = 1 no ID with multiple records cleaned of dupes
data_cleaned %>% group_by(ID) %>% count() %>% arrange(desc(n))

```

1c Creating factor variable for sex and married 

```{r}

data_cleaned <- 
  data_cleaned %>% 
  mutate(sex_fct = 
           fct_explicit_na(Sex)
  )


data_cleaned <-
data_cleaned %>% 
mutate(
  sex_simple = 
    fct_lump_n(Sex, n = 4)
)

#checking to see if its a factor
class(data_cleaned$sex_fct)

#checking levels
levels(data_cleaned$sex_simple)

#creating a table against Sex column 
table(data_cleaned$sex_fct, data_cleaned$sex_simple)

#making married a factor 
data_cleaned_columns <- 
  data_cleaned_columns %>% 
  mutate(married_fct = 
           fct_explicit_na(Married)
  )

#checking to see if its a factor
class(data_cleaned$married_fct)


```




```{r}



```

1d #Mean, Median, and Count of Giving in Age Ranges 

```{r}

age_range_giving <- datacleaning %>%
  group_by(age_range) %>%
  summarise(avg_giving = mean(HH.Lifetime.Giving, na.rm = TRUE),
            med_giving = median(HH.Lifetime.Giving, na.rm = TRUE),
            amount_of_people_in_age_range = n())



```





## Question 2

DonorSegment Analysis

```{r}
#grouping by donorsegment and analyzing 
data_cleaned_columns %>%
  group_by(Donor.Segment) %>%
  summarise(Count = length(Donor.Segment),
            mean_total_giv = mean(HH.Lifetime.Giving)) %>%
  arrange(-Count) %>%
  filter(Count >= 100) %>%
  #added scales package to have the values show in dollar 
  mutate(mean_total_giv = dollar(mean_total_giv)) %>%
  kable(col.names = c("Donor Segment", "Count", "Mean HH Lifetime Giving"), align=rep('c', 3)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = F)
  

```

2a) Plotting average giving by age range 


```{r}

ggplot(age_range_giving, aes(avg_giving, age_range)) +
  geom_bar(stat = "identity")


```


2b) Count of donors based on age range (another way to look at it)


```{r}

ggplot(datacleaning, 
       aes(age_range)) + 
       geom_bar() + 
       theme(axis.text.x = element_text(angle=45,
                                        hjust=1)) + 
  labs(title = "Count of Age Ranges", x = "", y = "")
  

```

2c) Boxplot of the Age Ranges Against the Lifetime Giving Amounts with a log scale applied - the reason we applied log scale is to resolve issues with visualizations that skew towards large values in our dataset. 


```{r}

ggplot(datacleaning, aes(age_range,HH.Lifetime.Giving,fill = age_range)) + 
  geom_boxplot(
  outlier.colour = "red") + 
  scale_y_log10() +
  theme(axis.text.x=element_text(angle=45,hjust=1))
  

```

2d) Splitting by age and gender 


```{r}


#creating boxplots 
datacleaning %>% 
  filter(Age < 100) %>% #removing the weird outliers that are over 100 
  filter(Sex %in% c("M", "F")) %>%
  ggplot(aes(Sex, Age)) + 
  geom_boxplot() + 
  theme_economist() + 
  ggtitle("Ages of Donors Based on Gender") + 
  xlab(NULL) + ylab(NULL)
  
  
  


```

2e) Distribution of people in the states that they live.

```{r}

  datacleaning %>%
  mutate(State = ifelse(State == " ", "NA", State)) %>%
  filter(State != "NA") %>%
  group_by(State) %>%
  summarise(Count = length(State)) %>%
  filter(Count > 800) %>%
  arrange(-Count) %>%
  kable(col.names = c("Donor's State", "Count")) %>%
  kable_styling(bootstrap_options = c("condensed"),
                full_width = F)
  
 
  
  


```

2f) Looking at all donors first gift amount. 75% made a first gift of <100. 

```{r}

 no_non_donors <- datacleaning %>%
  filter(Lifetime.Giving != 0)
  
nd <- quantile(no_non_donors$HH.First.Gift.Amount, probs = c(.25,.50,.75,.9,.99), na.rm = TRUE)

nd <- as.data.frame(nd)

nd %>%
  kable(col.names = "Quantile") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = F)
  
  


```



## Modeling for you 


3a) Linear model 

```{r}
#converting married Y and N to 1 and 0 
datacleaning <- datacleaning %>%
      mutate(Married_simple = ifelse(Married == "N",0,1))
 

mod1lm <- lm( Married_simple ~ Lifetime.Giving,
           data = datacleaning)

summary(mod1lm)
  


```
Kmeans
```{r}

pred_vars <- c('married_fct', 'sex_fct') 
 
 
data_cleaned_K <- select(data_cleaned,
                     pred_vars,
                     HH.Lifetime.Giving)
 
#build cluster
dd_kmeans <- kmeans(x = data_cleaned_K, 
                    centers = 5, 
                    nstart = 10)

is.numeric(data_cleaned$HH.Lifetime.Giving)

```





3a) 

```{r}
p <- datacleaning %>%
  ggplot(aes(Age)) + geom_histogram(bins=30, fill = "blue") + theme_economist_white() +
  ggtitle("Overall Donor Age Distribution") + 
  xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(5,100,by = 20)) +
  scale_y_continuous(breaks = seq(20,100,by = 20)) + xlim(c(20,100))

ggplotly(p)
  
p

ggplot(data = datacleaning, aes(x = Age)) + geom_histogram(fill ="blue")+ xlim(c(20,100))

  


```
